Group Project Wilson & Mills¶
In [2]:
#Libraries Used
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.linear_model import Lasso
from sklearn.linear_model import MultiTaskLassoCV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RepeatedKFold
from sklearn.preprocessing import scale
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.cluster import KMeans
import missingno as msno
import statsmodels.api as sm
import scipy.stats as stats
from statsmodels.formula.api import ols
from statsmodels.multivariate.manova import MANOVA
from statsmodels.stats.outliers_influence import variance_inflation_factor
import plotly.express as px
Import Data and Review¶
In [4]:
#Import Dataset
original_df = pd.read_csv("US_Accidents_March23.csv")
original_df.columns.tolist
Out[4]:
<bound method IndexOpsMixin.tolist of Index(['ID', 'Source', 'Severity', 'Start_Time', 'End_Time', 'Start_Lat',
'Start_Lng', 'End_Lat', 'End_Lng', 'Distance(mi)', 'Description',
'Street', 'City', 'County', 'State', 'Zipcode', 'Country', 'Timezone',
'Airport_Code', 'Weather_Timestamp', 'Temperature(F)', 'Wind_Chill(F)',
'Humidity(%)', 'Pressure(in)', 'Visibility(mi)', 'Wind_Direction',
'Wind_Speed(mph)', 'Precipitation(in)', 'Weather_Condition', 'Amenity',
'Bump', 'Crossing', 'Give_Way', 'Junction', 'No_Exit', 'Railway',
'Roundabout', 'Station', 'Stop', 'Traffic_Calming', 'Traffic_Signal',
'Turning_Loop', 'Sunrise_Sunset', 'Civil_Twilight', 'Nautical_Twilight',
'Astronomical_Twilight'],
dtype='object')>
In [5]:
#Data Characteristics
original_df.head(5)
Out[5]:
| ID | Source | Severity | Start_Time | End_Time | Start_Lat | Start_Lng | End_Lat | End_Lng | Distance(mi) | ... | Roundabout | Station | Stop | Traffic_Calming | Traffic_Signal | Turning_Loop | Sunrise_Sunset | Civil_Twilight | Nautical_Twilight | Astronomical_Twilight | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | A-1 | Source2 | 3 | 2016-02-08 05:46:00 | 2016-02-08 11:00:00 | 39.865147 | -84.058723 | NaN | NaN | 0.01 | ... | False | False | False | False | False | False | Night | Night | Night | Night |
| 1 | A-2 | Source2 | 2 | 2016-02-08 06:07:59 | 2016-02-08 06:37:59 | 39.928059 | -82.831184 | NaN | NaN | 0.01 | ... | False | False | False | False | False | False | Night | Night | Night | Day |
| 2 | A-3 | Source2 | 2 | 2016-02-08 06:49:27 | 2016-02-08 07:19:27 | 39.063148 | -84.032608 | NaN | NaN | 0.01 | ... | False | False | False | False | True | False | Night | Night | Day | Day |
| 3 | A-4 | Source2 | 3 | 2016-02-08 07:23:34 | 2016-02-08 07:53:34 | 39.747753 | -84.205582 | NaN | NaN | 0.01 | ... | False | False | False | False | False | False | Night | Day | Day | Day |
| 4 | A-5 | Source2 | 2 | 2016-02-08 07:39:07 | 2016-02-08 08:09:07 | 39.627781 | -84.188354 | NaN | NaN | 0.01 | ... | False | False | False | False | True | False | Day | Day | Day | Day |
5 rows × 46 columns
In [6]:
#Data Characteristics
original_df.describe(include = 'all')
Out[6]:
| ID | Source | Severity | Start_Time | End_Time | Start_Lat | Start_Lng | End_Lat | End_Lng | Distance(mi) | ... | Roundabout | Station | Stop | Traffic_Calming | Traffic_Signal | Turning_Loop | Sunrise_Sunset | Civil_Twilight | Nautical_Twilight | Astronomical_Twilight | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 7728394 | 7728394 | 7.728394e+06 | 7728394 | 7728394 | 7.728394e+06 | 7.728394e+06 | 4.325632e+06 | 4.325632e+06 | 7.728394e+06 | ... | 7728394 | 7728394 | 7728394 | 7728394 | 7728394 | 7728394 | 7705148 | 7705148 | 7705148 | 7705148 |
| unique | 7728394 | 3 | NaN | 6131796 | 6705355 | NaN | NaN | NaN | NaN | NaN | ... | 2 | 2 | 2 | 2 | 2 | 1 | 2 | 2 | 2 | 2 |
| top | A-1 | Source1 | NaN | 2021-01-26 16:16:13 | 2021-11-22 08:00:00 | NaN | NaN | NaN | NaN | NaN | ... | False | False | False | False | False | False | Day | Day | Day | Day |
| freq | 1 | 4325632 | NaN | 225 | 112 | NaN | NaN | NaN | NaN | NaN | ... | 7728145 | 7526493 | 7514023 | 7720796 | 6584622 | 7728394 | 5334553 | 5695619 | 6076156 | 6377548 |
| mean | NaN | NaN | 2.212384e+00 | NaN | NaN | 3.620119e+01 | -9.470255e+01 | 3.626183e+01 | -9.572557e+01 | 5.618423e-01 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| std | NaN | NaN | 4.875313e-01 | NaN | NaN | 5.076079e+00 | 1.739176e+01 | 5.272905e+00 | 1.810793e+01 | 1.776811e+00 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| min | NaN | NaN | 1.000000e+00 | NaN | NaN | 2.455480e+01 | -1.246238e+02 | 2.456601e+01 | -1.245457e+02 | 0.000000e+00 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 25% | NaN | NaN | 2.000000e+00 | NaN | NaN | 3.339963e+01 | -1.172194e+02 | 3.346207e+01 | -1.177543e+02 | 0.000000e+00 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 50% | NaN | NaN | 2.000000e+00 | NaN | NaN | 3.582397e+01 | -8.776662e+01 | 3.618349e+01 | -8.802789e+01 | 3.000000e-02 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 75% | NaN | NaN | 2.000000e+00 | NaN | NaN | 4.008496e+01 | -8.035368e+01 | 4.017892e+01 | -8.024709e+01 | 4.640000e-01 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| max | NaN | NaN | 4.000000e+00 | NaN | NaN | 4.900220e+01 | -6.711317e+01 | 4.907500e+01 | -6.710924e+01 | 4.417500e+02 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
11 rows × 46 columns
In [7]:
#Data Characteristics
original_df.isna().sum()
Out[7]:
ID 0 Source 0 Severity 0 Start_Time 0 End_Time 0 Start_Lat 0 Start_Lng 0 End_Lat 3402762 End_Lng 3402762 Distance(mi) 0 Description 5 Street 10869 City 253 County 0 State 0 Zipcode 1915 Country 0 Timezone 7808 Airport_Code 22635 Weather_Timestamp 120228 Temperature(F) 163853 Wind_Chill(F) 1999019 Humidity(%) 174144 Pressure(in) 140679 Visibility(mi) 177098 Wind_Direction 175206 Wind_Speed(mph) 571233 Precipitation(in) 2203586 Weather_Condition 173459 Amenity 0 Bump 0 Crossing 0 Give_Way 0 Junction 0 No_Exit 0 Railway 0 Roundabout 0 Station 0 Stop 0 Traffic_Calming 0 Traffic_Signal 0 Turning_Loop 0 Sunrise_Sunset 23246 Civil_Twilight 23246 Nautical_Twilight 23246 Astronomical_Twilight 23246 dtype: int64
Clean Dataset for Experiment¶
In [9]:
#Remove fields to not be modeled and make values useable for multivariate analysis
GA_DF = original_df.loc[original_df['State'] == 'GA'].drop(columns =['Source',
'ID',
'Description',
'State',
'Street',
'End_Lat',
'End_Lng',
'End_Time',
'City',
'County',
'Country',
'Timezone',
'Zipcode',
'Bump',
'Weather_Condition',
'Airport_Code',
'Wind_Direction',
'Weather_Timestamp',
'Civil_Twilight',
'Nautical_Twilight',
'Traffic_Calming',
'Roundabout',
'Turning_Loop',
'Astronomical_Twilight',
'Sunrise_Sunset',
'Start_Time'],
axis=1
).rename(
columns ={'Distance(mi)' : 'Distance'}
)
with pd.option_context('future.no_silent_downcasting', True):
GA_DF.loc[:,'Amenity':'Traffic_Signal'] = GA_DF.loc[:,'Amenity':'Traffic_Signal'].astype(int)
C:\Users\Mills\AppData\Local\Temp\ipykernel_14768\936434541.py:33: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise in a future error of pandas. Value '[0 0 0 ... 0 0 0]' has dtype incompatible with bool, please explicitly cast to a compatible dtype first. GA_DF.loc[:,'Amenity':'Traffic_Signal'] = GA_DF.loc[:,'Amenity':'Traffic_Signal'].astype(int) C:\Users\Mills\AppData\Local\Temp\ipykernel_14768\936434541.py:33: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise in a future error of pandas. Value '[0 0 0 ... 0 0 0]' has dtype incompatible with bool, please explicitly cast to a compatible dtype first. GA_DF.loc[:,'Amenity':'Traffic_Signal'] = GA_DF.loc[:,'Amenity':'Traffic_Signal'].astype(int) C:\Users\Mills\AppData\Local\Temp\ipykernel_14768\936434541.py:33: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise in a future error of pandas. Value '[0 0 0 ... 0 0 0]' has dtype incompatible with bool, please explicitly cast to a compatible dtype first. GA_DF.loc[:,'Amenity':'Traffic_Signal'] = GA_DF.loc[:,'Amenity':'Traffic_Signal'].astype(int) C:\Users\Mills\AppData\Local\Temp\ipykernel_14768\936434541.py:33: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise in a future error of pandas. Value '[0 0 0 ... 0 0 1]' has dtype incompatible with bool, please explicitly cast to a compatible dtype first. GA_DF.loc[:,'Amenity':'Traffic_Signal'] = GA_DF.loc[:,'Amenity':'Traffic_Signal'].astype(int) C:\Users\Mills\AppData\Local\Temp\ipykernel_14768\936434541.py:33: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise in a future error of pandas. Value '[0 0 0 ... 0 0 0]' has dtype incompatible with bool, please explicitly cast to a compatible dtype first. GA_DF.loc[:,'Amenity':'Traffic_Signal'] = GA_DF.loc[:,'Amenity':'Traffic_Signal'].astype(int) C:\Users\Mills\AppData\Local\Temp\ipykernel_14768\936434541.py:33: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise in a future error of pandas. Value '[0 0 0 ... 0 0 0]' has dtype incompatible with bool, please explicitly cast to a compatible dtype first. GA_DF.loc[:,'Amenity':'Traffic_Signal'] = GA_DF.loc[:,'Amenity':'Traffic_Signal'].astype(int) C:\Users\Mills\AppData\Local\Temp\ipykernel_14768\936434541.py:33: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise in a future error of pandas. Value '[0 0 0 ... 0 0 0]' has dtype incompatible with bool, please explicitly cast to a compatible dtype first. GA_DF.loc[:,'Amenity':'Traffic_Signal'] = GA_DF.loc[:,'Amenity':'Traffic_Signal'].astype(int) C:\Users\Mills\AppData\Local\Temp\ipykernel_14768\936434541.py:33: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise in a future error of pandas. Value '[0 0 0 ... 0 0 0]' has dtype incompatible with bool, please explicitly cast to a compatible dtype first. GA_DF.loc[:,'Amenity':'Traffic_Signal'] = GA_DF.loc[:,'Amenity':'Traffic_Signal'].astype(int) C:\Users\Mills\AppData\Local\Temp\ipykernel_14768\936434541.py:33: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise in a future error of pandas. Value '[0 0 0 ... 1 0 0]' has dtype incompatible with bool, please explicitly cast to a compatible dtype first. GA_DF.loc[:,'Amenity':'Traffic_Signal'] = GA_DF.loc[:,'Amenity':'Traffic_Signal'].astype(int)
In [10]:
#Find missingness relationship
msno.heatmap(GA_DF, cmap='YlGnBu')
Out[10]:
<Axes: >
In [11]:
#Remove Missing
GA_DF = GA_DF.dropna()
In [12]:
#Verify all missing values no longer present
msno.heatmap(GA_DF, cmap='YlGnBu')
C:\Users\Mills\Anaconda\Lib\site-packages\seaborn\matrix.py:309: UserWarning: Attempting to set identical low and high xlims makes transformation singular; automatically expanding. ax.set(xlim=(0, self.data.shape[1]), ylim=(0, self.data.shape[0])) C:\Users\Mills\Anaconda\Lib\site-packages\seaborn\matrix.py:309: UserWarning: Attempting to set identical low and high ylims makes transformation singular; automatically expanding. ax.set(xlim=(0, self.data.shape[1]), ylim=(0, self.data.shape[0]))
Out[12]:
<Axes: >
In [13]:
#Experiment Dataset characteristics
GA_DF.describe(include = 'all')
Out[13]:
| Severity | Start_Lat | Start_Lng | Distance | Temperature(F) | Wind_Chill(F) | Humidity(%) | Pressure(in) | Visibility(mi) | Wind_Speed(mph) | Precipitation(in) | Amenity | Crossing | Give_Way | Junction | No_Exit | Railway | Station | Stop | Traffic_Signal | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 96622.000000 | 96622.000000 | 96622.000000 | 96622.000000 | 96622.000000 | 96622.000000 | 96622.000000 | 96622.000000 | 96622.000000 | 96622.000000 | 96622.000000 | 96622.000000 | 96622.000000 | 96622.000000 | 96622.000000 | 96622.000000 | 96622.000000 | 96622.000000 | 96622.000000 | 96622.000000 |
| mean | 2.401534 | 33.565999 | -83.969520 | 0.840086 | 63.821845 | 63.106559 | 70.141759 | 29.197364 | 8.913840 | 6.161196 | 0.009756 | 0.001583 | 0.040446 | 0.002815 | 0.089359 | 0.000455 | 0.005558 | 0.000880 | 0.006365 | 0.073700 |
| std | 0.636650 | 0.626898 | 0.927210 | 1.992995 | 15.332620 | 16.540394 | 21.879941 | 0.382641 | 2.458774 | 4.608659 | 0.053045 | 0.039762 | 0.197005 | 0.052983 | 0.285262 | 0.021335 | 0.074343 | 0.029647 | 0.079527 | 0.261283 |
| min | 1.000000 | 30.626320 | -85.546465 | 0.000000 | 8.000000 | -10.000000 | 9.000000 | 27.790000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 2.000000 | 33.510780 | -84.437874 | 0.000000 | 52.000000 | 52.000000 | 52.000000 | 28.940000 | 10.000000 | 3.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 50% | 2.000000 | 33.744621 | -84.334777 | 0.143000 | 65.000000 | 65.000000 | 74.000000 | 29.080000 | 10.000000 | 6.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 75% | 3.000000 | 33.881139 | -84.048462 | 0.968000 | 75.000000 | 75.000000 | 90.000000 | 29.370000 | 10.000000 | 9.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| max | 4.000000 | 34.992405 | -80.852722 | 95.852000 | 140.000000 | 140.000000 | 100.000000 | 30.560000 | 12.000000 | 38.000000 | 1.850000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
In [14]:
#Experiment Dataset characteristics
GA_DF.head(20)
Out[14]:
| Severity | Start_Lat | Start_Lng | Distance | Temperature(F) | Wind_Chill(F) | Humidity(%) | Pressure(in) | Visibility(mi) | Wind_Speed(mph) | Precipitation(in) | Amenity | Crossing | Give_Way | Junction | No_Exit | Railway | Station | Stop | Traffic_Signal | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 146516 | 3 | 33.690125 | -84.500153 | 0.01 | 45.0 | 40.6 | 86.0 | 30.16 | 5.0 | 8.1 | 0.09 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 146517 | 3 | 33.665138 | -84.418549 | 0.01 | 45.0 | 38.9 | 93.0 | 30.18 | 3.0 | 12.7 | 0.06 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 146518 | 3 | 33.671810 | -84.328018 | 0.01 | 45.0 | 38.9 | 93.0 | 30.18 | 3.0 | 12.7 | 0.06 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 146519 | 3 | 33.892895 | -84.260452 | 0.01 | 44.1 | 40.8 | 89.0 | 30.17 | 3.0 | 5.8 | 0.02 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 146520 | 3 | 33.690125 | -84.500153 | 0.01 | 45.0 | 40.6 | 86.0 | 30.16 | 5.0 | 8.1 | 0.09 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 146521 | 3 | 33.681328 | -84.411522 | 0.01 | 45.0 | 37.6 | 100.0 | 30.12 | 5.0 | 17.3 | 0.10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 146522 | 3 | 33.617954 | -84.484985 | 0.01 | 45.0 | 37.6 | 100.0 | 30.12 | 5.0 | 17.3 | 0.10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 146523 | 3 | 33.745052 | -84.389732 | 0.01 | 45.0 | 37.6 | 100.0 | 30.12 | 5.0 | 17.3 | 0.10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 146524 | 3 | 33.745556 | -84.349213 | 0.01 | 45.0 | 37.6 | 100.0 | 30.12 | 5.0 | 17.3 | 0.10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 146525 | 3 | 33.885162 | -84.251266 | 0.01 | 44.1 | 40.1 | 89.0 | 30.15 | 7.0 | 6.9 | 0.06 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 146526 | 3 | 33.866615 | -84.249062 | 0.01 | 44.1 | 40.1 | 89.0 | 30.15 | 7.0 | 6.9 | 0.06 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 146528 | 3 | 33.699013 | -84.266167 | 0.01 | 45.0 | 38.5 | 100.0 | 30.14 | 5.0 | 13.8 | 0.10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 146529 | 3 | 33.545197 | -84.268410 | 0.01 | 45.0 | 38.9 | 93.0 | 30.14 | 4.0 | 12.7 | 0.12 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 146530 | 3 | 33.656929 | -84.497757 | 2.37 | 45.0 | 38.9 | 93.0 | 30.14 | 4.0 | 12.7 | 0.12 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 146531 | 3 | 33.619576 | -84.460335 | 0.01 | 45.0 | 38.9 | 93.0 | 30.14 | 4.0 | 12.7 | 0.12 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 146533 | 3 | 33.743549 | -84.332092 | 0.01 | 45.0 | 38.9 | 93.0 | 30.14 | 4.0 | 12.7 | 0.12 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 146535 | 3 | 33.698853 | -84.265923 | 1.67 | 44.1 | 37.4 | 96.0 | 30.15 | 2.0 | 13.8 | 0.08 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 146540 | 3 | 33.753635 | -84.495628 | 0.01 | 45.0 | 40.6 | 86.0 | 30.16 | 3.0 | 8.1 | 0.02 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 146541 | 2 | 33.699425 | -84.457336 | 0.52 | 44.1 | 37.4 | 96.0 | 30.14 | 3.0 | 13.8 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 146542 | 3 | 33.823265 | -84.355835 | 0.01 | 44.1 | 40.8 | 89.0 | 30.17 | 4.0 | 5.8 | 0.04 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
In [15]:
#Location Overview
levels, categories = pd.factorize(sorted(GA_DF['Severity'], reverse = False))
scatter = plt.scatter(GA_DF['Start_Lng'],GA_DF['Start_Lat'], s=1, c=levels)
plt.legend(scatter.legend_elements()[0], categories, title='Severity')
plt.gca().set(xlabel='Longitude', ylabel='Distance of Road Impacted (mi)', title='Georgias Most Impactful Accidents')
C:\Users\Mills\AppData\Local\Temp\ipykernel_14768\2843138273.py:2: FutureWarning: factorize with argument that is not not a Series, Index, ExtensionArray, or np.ndarray is deprecated and will raise in a future version. levels, categories = pd.factorize(sorted(GA_DF['Severity'], reverse = False))
Out[15]:
[Text(0.5, 0, 'Longitude'), Text(0, 0.5, 'Distance of Road Impacted (mi)'), Text(0.5, 1.0, 'Georgias Most Impactful Accidents')]
In [16]:
#Create dependent and Independent variables
X = GA_DF.loc[:, ~GA_DF.columns.isin(['Severity','Distance'])]
Y = GA_DF[['Severity','Distance']]
In [17]:
#Independent variable characteristics
X.describe()
Out[17]:
| Start_Lat | Start_Lng | Temperature(F) | Wind_Chill(F) | Humidity(%) | Pressure(in) | Visibility(mi) | Wind_Speed(mph) | Precipitation(in) | Amenity | Crossing | Give_Way | Junction | No_Exit | Railway | Station | Stop | Traffic_Signal | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 96622.000000 | 96622.000000 | 96622.000000 | 96622.000000 | 96622.000000 | 96622.000000 | 96622.000000 | 96622.000000 | 96622.000000 | 96622.000000 | 96622.000000 | 96622.000000 | 96622.000000 | 96622.000000 | 96622.000000 | 96622.000000 | 96622.000000 | 96622.000000 |
| mean | 33.565999 | -83.969520 | 63.821845 | 63.106559 | 70.141759 | 29.197364 | 8.913840 | 6.161196 | 0.009756 | 0.001583 | 0.040446 | 0.002815 | 0.089359 | 0.000455 | 0.005558 | 0.000880 | 0.006365 | 0.073700 |
| std | 0.626898 | 0.927210 | 15.332620 | 16.540394 | 21.879941 | 0.382641 | 2.458774 | 4.608659 | 0.053045 | 0.039762 | 0.197005 | 0.052983 | 0.285262 | 0.021335 | 0.074343 | 0.029647 | 0.079527 | 0.261283 |
| min | 30.626320 | -85.546465 | 8.000000 | -10.000000 | 9.000000 | 27.790000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 33.510780 | -84.437874 | 52.000000 | 52.000000 | 52.000000 | 28.940000 | 10.000000 | 3.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 50% | 33.744621 | -84.334777 | 65.000000 | 65.000000 | 74.000000 | 29.080000 | 10.000000 | 6.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 75% | 33.881139 | -84.048462 | 75.000000 | 75.000000 | 90.000000 | 29.370000 | 10.000000 | 9.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| max | 34.992405 | -80.852722 | 140.000000 | 140.000000 | 100.000000 | 30.560000 | 12.000000 | 38.000000 | 1.850000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
Assumption Tests¶
In [19]:
#Correlation map
sb.heatmap(X.corr(), vmax=1., square=True)
Out[19]:
<Axes: >
In [20]:
#All independents Q-Q Plot
fig = sm.qqplot(X, line='45')
plt.show()
In [21]:
#Test Multicollinearity
vif_data = pd.DataFrame()
vif_data["feature"] = X.columns
vif_data["VIF"] = [variance_inflation_factor(X.values, i)
for i in range(X.shape[1])]
vif_data
Out[21]:
| feature | VIF | |
|---|---|---|
| 0 | Start_Lat | 4841.754669 |
| 1 | Start_Lng | 8524.177561 |
| 2 | Temperature(F) | 2201.679062 |
| 3 | Wind_Chill(F) | 1865.221464 |
| 4 | Humidity(%) | 16.054051 |
| 5 | Pressure(in) | 2250.049006 |
| 6 | Visibility(mi) | 19.867626 |
| 7 | Wind_Speed(mph) | 3.735512 |
| 8 | Precipitation(in) | 1.189233 |
| 9 | Amenity | 1.009975 |
| 10 | Crossing | 1.576543 |
| 11 | Give_Way | 1.035104 |
| 12 | Junction | 1.109717 |
| 13 | No_Exit | 1.003336 |
| 14 | Railway | 1.052809 |
| 15 | Station | 1.016101 |
| 16 | Stop | 1.036660 |
| 17 | Traffic_Signal | 1.657900 |
Baseline Model¶
In [23]:
#Split test and train data
X_train, X_test, y_train, y_test = train_test_split(
X, Y, test_size=0.3, random_state=1)
In [24]:
#Build basic multivariate model
regr = linear_model.LinearRegression()
regr.fit(X_train, y_train)
regression_predictions = regr.predict(X_test)
In [25]:
#MSE
mean_squared_error(y_test, regression_predictions)
Out[25]:
2.176909428131628
In [26]:
#MAE
mean_absolute_error(y_test, regression_predictions)
Out[26]:
0.7453151588620005
In [27]:
#R2
r2_score(y_test, regression_predictions)
Out[27]:
0.0299620876438198
PCA¶
In [29]:
#Scaler Transformation
scaler = StandardScaler()
In [30]:
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
In [31]:
pca = PCA(.95).fit(X_train)
pca.n_components_
Out[31]:
15
In [32]:
plt.figure()
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('Number of Factors')
plt.ylabel('Variance (%)')
plt.title('Pre-PCA Transformation Explained Variance')
plt.show()
In [33]:
#PCA Transformation
X_train = pca.transform(X_train)
X_test = pca.transform(X_test)
In [34]:
#PCA Model
logisticRegr = LogisticRegression(solver = 'lbfgs')
logisticRegr.fit(X_train,y_train['Severity'].to_numpy())
predictions = logisticRegr.predict(X_test)
In [35]:
#MSE
mean_squared_error(y_test['Severity'], predictions)
Out[35]:
0.5665298237140787
In [36]:
#MAE
mean_absolute_error(y_test['Severity'], predictions)
Out[36]:
0.42301721461344743
In [37]:
#R2
r2_score(y_test['Severity'], predictions)
Out[37]:
-0.39302769823280603
In [48]:
#Model coefficients
pca = PCA()
pca.fit(X)
explained_variance_ratio = pca.explained_variance_ratio_
explained_variance_ratio
Out[48]:
array([5.91402289e-01, 3.81855025e-01, 1.95532775e-02, 4.60423023e-03,
1.10073051e-03, 1.02928631e-03, 2.06317484e-04, 8.50440565e-05,
7.36743771e-05, 4.96977398e-05, 2.11041699e-05, 6.27070398e-06,
5.16464128e-06, 2.63330694e-06, 2.40588890e-06, 1.55576579e-06,
8.47049893e-07, 4.46516706e-07])
In [50]:
#Scree Plot
plt.figure(figsize=(8, 6))
plt.plot(range(1, len(explained_variance_ratio) + 1), explained_variance_ratio, marker='o', linestyle='--')
plt.title('PCA Scree Plot')
plt.xlabel('Principal Component')
plt.ylabel('Explained Variance Ratio')
plt.xticks(np.arange(1, len(explained_variance_ratio) + 1, 1))
plt.show()
In [52]:
#Explained Variance
plt.figure()
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('Number of Components')
plt.ylabel('Variance (%)')
plt.title('Explained Variance')
plt.show()
In [54]:
#Store components
pca = PCA(n_components=3)
components = pca.fit_transform(X)
In [56]:
#MANOVA
data_pca = pd.concat([Y.reset_index(drop=True), pd.DataFrame(components,columns = ['PC1','PC2','PC3']).reset_index(drop=True)],axis=1)
formula = 'Severity + Distance ~ PC1 + PC2 +PC3'
manova_pca = MANOVA.from_formula(formula, data = data_pca)
results = manova_pca.mv_test()
print(results)
Multivariate linear model
====================================================================
--------------------------------------------------------------------
Intercept Value Num DF Den DF F Value Pr > F
--------------------------------------------------------------------
Wilks' lambda 0.0634 2.0000 96617.0000 713623.0480 0.0000
Pillai's trace 0.9366 2.0000 96617.0000 713623.0480 0.0000
Hotelling-Lawley trace 14.7722 2.0000 96617.0000 713623.0480 0.0000
Roy's greatest root 14.7722 2.0000 96617.0000 713623.0480 0.0000
--------------------------------------------------------------------
--------------------------------------------------------------------
PC1 Value Num DF Den DF F Value Pr > F
--------------------------------------------------------------------
Wilks' lambda 0.9968 2.0000 96617.0000 153.3970 0.0000
Pillai's trace 0.0032 2.0000 96617.0000 153.3970 0.0000
Hotelling-Lawley trace 0.0032 2.0000 96617.0000 153.3970 0.0000
Roy's greatest root 0.0032 2.0000 96617.0000 153.3970 0.0000
--------------------------------------------------------------------
---------------------------------------------------------------------
PC2 Value Num DF Den DF F Value Pr > F
---------------------------------------------------------------------
Wilks' lambda 0.9987 2.0000 96617.0000 65.1561 0.0000
Pillai's trace 0.0013 2.0000 96617.0000 65.1561 0.0000
Hotelling-Lawley trace 0.0013 2.0000 96617.0000 65.1561 0.0000
Roy's greatest root 0.0013 2.0000 96617.0000 65.1561 0.0000
--------------------------------------------------------------------
---------------------------------------------------------------------
PC3 Value Num DF Den DF F Value Pr > F
---------------------------------------------------------------------
Wilks' lambda 0.9998 2.0000 96617.0000 11.4135 0.0000
Pillai's trace 0.0002 2.0000 96617.0000 11.4135 0.0000
Hotelling-Lawley trace 0.0002 2.0000 96617.0000 11.4135 0.0000
Roy's greatest root 0.0002 2.0000 96617.0000 11.4135 0.0000
====================================================================
In [58]:
#PC1 vs PC2
fig = px.scatter(components, x=0, y=1, color=GA_DF['Severity'],labels={
"0": "PC1",
"1": "PC2",
"color": "Severity"
},
title="PCA Scatterplot (PC1 vs. PC2)")
fig.show()
In [59]:
#PC1 vs PC3
fig = px.scatter(components, x=0, y=2, color=GA_DF['Severity'],labels={
"0": "PC1",
"2": "PC3",
"color": "Severity"
},
title="PCA Scatterplot (PC1 vs. PC3)")
fig.show()
In [61]:
#PC2 vs PC3
fig = px.scatter(components, x=1, y=2, color=GA_DF['Severity'],labels={
"1": "PC2",
"2": "PC3",
"color": "Severity"
},
title="PCA Scatterplot (PC2 vs. PC3)")
fig.show()
In [62]:
# 3D Plot
total_var = pca.explained_variance_ratio_.sum() * 100
fig = px.scatter_3d(
components, x=0, y=1, z=2, color=GA_DF['Severity'],
title=f'Total Explained Variance: {total_var:.2f}%',
labels={'0': 'PC 1', '1': 'PC 2', '2': 'PC 3'}
)
fig.show()
In [63]:
#Loadings Plot
loadings = pca.components_
for i in range(3):
plt.plot(loadings[i], label=f'PC {i+1}', marker='o')
plt.title('Loading Plot')
plt.xlabel('Features')
plt.ylabel('Loading Value')
plt.ylim(-1, 1)
plt.legend()
plt.grid(True)
plt.show()
In [65]:
# Sum of loadings
loadings[0]+loadings[1]+loadings[2]
Out[65]:
array([-7.34207344e-03, 1.58423167e-03, 2.19539299e-02, -1.17291688e-01,
1.46574046e+00, -1.10696692e-02, -1.42066332e-01, 9.03878257e-01,
2.26310643e-03, -1.08027636e-05, -4.29280601e-04, -2.72007040e-05,
1.10087902e-03, 1.28605669e-05, 2.15818864e-05, -3.38563052e-05,
-2.88299942e-04, -1.13891322e-03])
In [67]:
# Q-Q plots against normal distribution
%matplotlib tk
fig, axes = plt.subplots(6, 3, figsize=(15, 5), layout = 'constrained')
axes = axes.flatten()
for i, col in enumerate(X.columns):
stats.probplot(GA_DF[col], dist="norm", plot=axes[i])
axes[i].set_title(f"Q-Q Plot of {col}")
plt.show()
LASSO Model¶
In [73]:
#LASSO train and test scaler transform
LassoX_train = scaler.fit_transform(X_train)
LassoX_test = scaler.transform(X_test)
In [75]:
#Lasso model
model = Lasso()
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
model = MultiTaskLassoCV(alphas=np.arange(0, 1, 0.01), cv=cv, n_jobs=-1).fit(LassoX_train, y_train)
print('alpha: %f' % model.alpha_)
C:\Users\Mills\Anaconda\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:675: UserWarning: Coordinate descent with l1_reg=0 may lead to unexpected results and is discouraged. C:\Users\Mills\Anaconda\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:675: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 131935.87954872157, tolerance: 26.818999514559525 C:\Users\Mills\Anaconda\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:675: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 135542.32259107713, tolerance: 27.543266845558065 C:\Users\Mills\Anaconda\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:675: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 131321.56251747793, tolerance: 26.69405847159424 C:\Users\Mills\Anaconda\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:675: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 134038.86828425183, tolerance: 27.23777233103883 C:\Users\Mills\Anaconda\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:675: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 126686.78579102649, tolerance: 25.764757318567447 C:\Users\Mills\Anaconda\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:675: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 125841.04588310918, tolerance: 25.59711961904939 C:\Users\Mills\Anaconda\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:675: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 135811.37739642186, tolerance: 27.59570762476094 C:\Users\Mills\Anaconda\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:675: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 134501.771932224, tolerance: 27.332988521505126 C:\Users\Mills\Anaconda\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:675: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 132562.1790970095, tolerance: 26.947903828166414 C:\Users\Mills\Anaconda\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:675: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 135460.82148482272, tolerance: 27.530134415960905 C:\Users\Mills\Anaconda\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:675: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 132519.93085208983, tolerance: 26.940069947787528 C:\Users\Mills\Anaconda\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:675: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 128314.55720768594, tolerance: 26.092418209251296 C:\Users\Mills\Anaconda\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:675: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 127696.63589309895, tolerance: 25.971860454500035 C:\Users\Mills\Anaconda\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:675: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 130727.47302150655, tolerance: 26.578294114195913 C:\Users\Mills\Anaconda\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:675: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 135783.62956220293, tolerance: 27.59398324902429 C:\Users\Mills\Anaconda\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:675: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 134326.5283312325, tolerance: 27.297875403834265 C:\Users\Mills\Anaconda\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:675: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 124319.30969121488, tolerance: 25.29158977840956 C:\Users\Mills\Anaconda\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:675: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 132890.42456852185, tolerance: 27.01240581898566 C:\Users\Mills\Anaconda\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:675: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 125846.97074779686, tolerance: 25.5947860975266 C:\Users\Mills\Anaconda\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:675: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 132990.51679790937, tolerance: 27.025420399330663 C:\Users\Mills\Anaconda\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:675: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 126789.84782092765, tolerance: 25.792506464047918 C:\Users\Mills\Anaconda\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:675: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 126005.54226547966, tolerance: 25.630746942442663 C:\Users\Mills\Anaconda\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:675: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 130140.40727755838, tolerance: 26.454039466686712 C:\Users\Mills\Anaconda\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:675: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 127436.04607645723, tolerance: 25.917250174170572 C:\Users\Mills\Anaconda\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:675: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 132859.06248942617, tolerance: 27.005384722123807 C:\Users\Mills\Anaconda\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:675: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 124236.74305945201, tolerance: 25.28099105235222 C:\Users\Mills\Anaconda\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:675: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 128376.38093085267, tolerance: 26.103525850683134 C:\Users\Mills\Anaconda\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:675: UserWarning: Coordinate descent with l1_reg=0 may lead to unexpected results and is discouraged. C:\Users\Mills\Anaconda\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:675: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 125294.03780777332, tolerance: 25.486902933789388 C:\Users\Mills\Anaconda\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:675: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 130910.63845642326, tolerance: 26.61761739496349 C:\Users\Mills\Anaconda\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:675: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 126840.30866713515, tolerance: 25.79445105932087 C:\Users\Mills\Anaconda\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:2559: UserWarning: Coordinate descent with l1_reg=0 may lead to unexpected results and is discouraged.
alpha: 0.000000
C:\Users\Mills\Anaconda\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:2559: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 144743.29109283234, tolerance: 29.427627354713856
In [78]:
#Prediction to test MSE et all
predictions = model.predict(LassoX_test)
In [80]:
#MSE
mean_squared_error(y_test, predictions)
Out[80]:
2.1847991973425933
In [82]:
#MAE
mean_absolute_error(y_test, predictions)
Out[82]:
0.7488280364373681
In [84]:
#R2
r2_score(y_test, predictions)
Out[84]:
0.02204012759790447
All of United States for Comparison¶
In [86]:
#Apply same data cleaning as GA dataset
US_DF = original_df.loc[original_df['State'] != 'GA'].dropna().drop(columns =['Source',
'ID',
'Description',
'State',
'Street',
'End_Lat',
'End_Lng',
'End_Time',
'City',
'County',
'Country',
'Timezone',
'Zipcode',
'Bump',
'Weather_Condition',
'Airport_Code',
'Wind_Direction',
'Weather_Timestamp',
'Civil_Twilight',
'Nautical_Twilight',
'Traffic_Calming',
'Roundabout',
'Turning_Loop',
'Astronomical_Twilight',
'Sunrise_Sunset',
'Start_Time'],
axis=1
).rename(
columns ={'Distance(mi)' : 'Distance'}
)
with pd.option_context('future.no_silent_downcasting', True):
US_DF.loc[:,'Amenity':'Traffic_Signal'] = US_DF.loc[:,'Amenity':'Traffic_Signal'].astype(int)
C:\Users\Mills\AppData\Local\Temp\ipykernel_14768\1931142849.py:33: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise in a future error of pandas. Value '[0 0 0 ... 0 0 0]' has dtype incompatible with bool, please explicitly cast to a compatible dtype first. C:\Users\Mills\AppData\Local\Temp\ipykernel_14768\1931142849.py:33: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise in a future error of pandas. Value '[0 0 0 ... 0 0 0]' has dtype incompatible with bool, please explicitly cast to a compatible dtype first. C:\Users\Mills\AppData\Local\Temp\ipykernel_14768\1931142849.py:33: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise in a future error of pandas. Value '[0 0 0 ... 0 0 0]' has dtype incompatible with bool, please explicitly cast to a compatible dtype first. C:\Users\Mills\AppData\Local\Temp\ipykernel_14768\1931142849.py:33: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise in a future error of pandas. Value '[0 0 1 ... 1 0 0]' has dtype incompatible with bool, please explicitly cast to a compatible dtype first. C:\Users\Mills\AppData\Local\Temp\ipykernel_14768\1931142849.py:33: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise in a future error of pandas. Value '[0 0 0 ... 0 0 0]' has dtype incompatible with bool, please explicitly cast to a compatible dtype first. C:\Users\Mills\AppData\Local\Temp\ipykernel_14768\1931142849.py:33: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise in a future error of pandas. Value '[0 0 0 ... 0 0 0]' has dtype incompatible with bool, please explicitly cast to a compatible dtype first. C:\Users\Mills\AppData\Local\Temp\ipykernel_14768\1931142849.py:33: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise in a future error of pandas. Value '[0 0 0 ... 0 0 0]' has dtype incompatible with bool, please explicitly cast to a compatible dtype first. C:\Users\Mills\AppData\Local\Temp\ipykernel_14768\1931142849.py:33: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise in a future error of pandas. Value '[0 0 0 ... 0 0 0]' has dtype incompatible with bool, please explicitly cast to a compatible dtype first. C:\Users\Mills\AppData\Local\Temp\ipykernel_14768\1931142849.py:33: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise in a future error of pandas. Value '[0 0 0 ... 0 0 0]' has dtype incompatible with bool, please explicitly cast to a compatible dtype first.
In [88]:
#Split test and training
X_all = US_DF.loc[:, ~US_DF.columns.isin(['Severity','Distance'])]
Y_all = US_DF[['Severity','Distance']]
In [90]:
#Predictions to test MSE et all
all_predictions = regr.predict(X_all)
In [92]:
#MSE
mean_squared_error(Y_all, all_predictions)
Out[92]:
7.551554730006793
In [94]:
#MAE
mean_absolute_error(Y_all, all_predictions)
Out[94]:
2.1411786002666657
In [96]:
#R2
r2_score(Y_all, all_predictions)
Out[96]:
-27.553697665435195